# -*- coding: utf-8 -*-
# @日期    : 2021/11/25 19:48
# @作者  : 万方名
# @FileName: reprocess.py

import sys
import random

# Seed the module-level RNG with a fixed value so the random.randint-based
# train/test split performed later in this script is repeatable across runs.
random.seed(10)


# Helpers: parse the feature-map description file and write the feature-name table.
def loadfmap(fname):
    """Parse a feature-map description file into lookup tables.

    Each feature starts on a line whose first whitespace-separated token
    contains a '.', followed by the feature type and a comma-separated list
    of ``name=code`` pairs, e.g. ``1. cap-shape: bell=b,conical=c,``.
    Lines whose first token has no '.' are treated as continuations of the
    previous feature's ``name=code`` list. Blank lines are ignored.

    Args:
        fname: Path of the feature-map file to read.

    Returns:
        A ``(fmap, nmap)`` tuple where ``fmap[idx][code]`` is the global
        integer id assigned to value ``code`` of feature number ``idx``, and
        ``nmap[id]`` is the readable name ``"<type>=<name>"`` for that id.
        Ids are assigned consecutively from 0 in file order.

    Raises:
        AssertionError: If the same feature number appears twice.
    """
    fmap = {}
    nmap = {}

    # The context manager guarantees the handle is closed even if parsing fails.
    with open(fname) as fin:
        for line in fin:
            arr = line.split()
            if not arr:
                # Blank / whitespace-only line: nothing to parse.
                continue
            if arr[0].find('.') != -1:
                # Header line: "<idx>. <type>: <name=code,...>"
                idx = int(arr[0].strip('.'))
                assert idx not in fmap
                fmap[idx] = {}
                ftype = arr[1].strip(':')
                content = arr[2]
            else:
                # Continuation line: more name=code pairs for the current feature.
                content = arr[0]
            for it in content.split(','):
                # A trailing comma produces an empty piece; skip it.
                if it.strip() == '':
                    continue
                k, v = it.split('=')
                fmap[idx][v] = len(nmap)
                nmap[len(nmap)] = ftype + '=' + k
    return fmap, nmap


def write_nmap(fo, nmap):
    """Write the feature-name table to ``fo``, one feature per line.

    Each line has the form ``<id>\\t<name>\\ti`` followed by a newline, for
    ids ``0 .. len(nmap) - 1`` in increasing order.

    Args:
        fo: Writable text file-like object (only ``write`` is used).
        nmap: Mapping from consecutive integer ids (starting at 0) to names.
    """
    total = len(nmap)
    feat_id = 0
    while feat_id < total:
        row = '{:d}\t{}\ti\n'.format(feat_id, nmap[feat_id])
        fo.write(row)
        feat_id += 1


# start here
# Step 1: parse the feature map and dump the id -> feature-name table.
fmap, nmap = loadfmap('../data/agaricus-lepiota.fmap')
with open('../data/featmap.txt', 'w') as fo:
    write_nmap(fo, nmap)

# Step 2: convert the comma-separated raw data into sparse text rows of the
# form "<label> <id>:1 <id>:1 ...". Label 'p' is written as 1 and 'e' as 0.
# Files are opened via `with` so they are closed even if a row fails to parse.
with open('../data/agaricus.txt', 'w') as fo, \
        open('../data/agaricus-lepiota.data') as fi:
    for l in fi:
        arr = l.split(',')
        if arr[0] == 'p':
            fo.write('1')
        else:
            assert arr[0] == 'e'
            fo.write('0')
        # Column i of the raw row holds the value code for feature number i.
        for i in range(1, len(arr)):
            fo.write(' %d:1' % fmap[i][arr[i].strip()])
        fo.write('\n')

# Step 3: randomly split the converted rows into train and test files.
# A row goes to the test file when a uniform draw from 1..5 equals k.
k = 1

with open('../data/agaricus.txt', 'r') as fi, \
        open('../data/agaricus.txt' + '.train', 'w') as ftr, \
        open('../data/agaricus.txt' + '.test', 'w') as fte:
    for l in fi:
        if random.randint(1, 5) == k:
            fte.write(l)
        else:
            ftr.write(l)


